Process: I explored the data on indeed.com and feature-engineered the results. The fields, before using dummy variables for the regression model, were:
I commented most cells to explain the reasoning behind the commands; this should illustrate my overall mindset on why each method was used. From there, I performed some regression modeling and made some visualizations.
import pandas as pd
import numpy as np
import requests
import io
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from itertools import zip_longest
from time import sleep
from math import sqrt
import re
import time
import random
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
# put data into a list while addressing None type values
def get_title(titles):
    """Return the text of each job-title element.

    Parameters: titles -- iterable of parsed tags (anything with a ``.text``
    attribute). Returns a list of strings; a failed iteration appends
    "data_not_available" and any None texts are mapped to ''.
    """
    title_list = []
    try:
        for t in titles:
            title_list.append(t.text)
    # BUG FIX: the original ``except None:`` is invalid — Python raises
    # "catching classes that do not inherit from BaseException" the moment
    # an exception occurs, so the fallback never ran. Catch the errors a
    # bad element actually raises instead.
    except (TypeError, AttributeError):
        title_list.append("data_not_available")
    title_list = ['' if v is None else v for v in title_list]
    return title_list
# put data into a list while addressing None type values
def get_location(locations):
    """Return the text of each job-location element.

    Appends "data_not_available" if iteration hits a bad element and maps
    any None texts to '' before returning.
    """
    location_list = []
    try:
        for location in locations:
            location_list.append(location.text)
    # BUG FIX: ``except None:`` is invalid (TypeError at raise time), so
    # the fallback could never run; catch the real element errors.
    except (TypeError, AttributeError):
        location_list.append("data_not_available")
    location_list = ['' if v is None else v for v in location_list]
    return location_list
# put data into a list while addressing None type values
def get_company(companies):
    # get the company name for each job (original comment said "lcoation")
    """Return the text of each company-name element.

    Appends "data_not_available" if iteration hits a bad element and maps
    any None texts to '' before returning.
    """
    company_list = []
    try:
        for company in companies:
            company_list.append(company.text)
    # BUG FIX: ``except None:`` is invalid (TypeError at raise time);
    # catch the real element errors so the fallback actually works.
    except (TypeError, AttributeError):
        company_list.append("data_not_available")
    company_list = ['' if v is None else v for v in company_list]
    return company_list
# put data into a list while addressing None type values
def get_salary(salaries):
    """Extract one yearly salary figure per snippet element.

    Yearly postings ("$X - $Y a year") keep the max value of the range;
    hourly postings keep the max rate converted to yearly pay
    (40 hours/week * 52 weeks). Anything else yields "data_not_available".
    """
    salary_list = []
    for v in salaries:
        text = v.text
        tmp_list = []
        if '$' in text and 'year' in text:
            # yearly salary extraction: strip '$' and thousands commas,
            # then keep the numeric tokens (non-digits count as 0)
            cleaned = text.replace('$', '').replace(',', '')
            for token in cleaned.split(" "):
                tmp_list.append(int(token) if token.isdigit() else 0)
            # since some have a range, take only the max value
            salary_list.append(max(tmp_list))
        # BUG FIX: this must be ``elif`` — the original used a second
        # independent ``if``/``else``, so every yearly row also fell into
        # the ``else`` branch and appended an extra "data_not_available",
        # misaligning the salary list with the other field lists.
        elif '$' in text and 'hour' in text:
            # hourly pay extraction
            cleaned = text.replace('$', '').replace(',', '')
            for token in cleaned.split(" "):
                tmp_list.append(int(token) if token.isdigit() else 0)
            # take the max of a range and convert hourly to yearly
            salary_list.append(max(tmp_list) * 40 * 52)
        else:
            salary_list.append("data_not_available")
    salary_list = ['' if v is None else v for v in salary_list]
    return salary_list
# put data into a list while addressing None type values
def get_link(job_links, base=None):
    """Build absolute job URLs from each element's ``href`` attribute.

    ``base`` defaults to the module-level ``base_url`` constant so existing
    callers are unaffected; pass it explicitly to override.
    """
    link_list = []
    if base is None:
        base = base_url  # fall back to the module-level constant
    try:
        for links in job_links:
            href = links['href']
            link_list.append(base + href)
    # BUG FIX: ``except None:`` is invalid (TypeError at raise time);
    # catch the subscript errors a bad element actually raises.
    except (TypeError, KeyError):
        link_list.append("data_not_available")
    link_list = ['' if v is None else v for v in link_list]
    return link_list
# put data into a list while addressing None type values
def get_rating(ratings):
    # get the ratings for each job's company
    """Return the text of each company-rating element.

    Appends "data_not_available" if iteration hits a bad element and maps
    any None texts to '' before returning.
    """
    rating_list = []
    try:
        for rating in ratings:
            rating_list.append(rating.text)
    # BUG FIX: ``except None:`` is invalid (TypeError at raise time);
    # catch the real element errors so the fallback actually works.
    except (TypeError, AttributeError):
        rating_list.append("data_not_available")
    rating_list = ['' if v is None else v for v in rating_list]
    return rating_list
# put data into a list while addressing None type values
def get_location_counts(location_counts):
    # get the "+N locations" counter for each job (original comment
    # wrongly said "ratings")
    """Return the text of each location-count element.

    Appends "data_not_available" if iteration hits a bad element and maps
    any None texts to '' before returning.
    """
    location_counts_list = []
    try:
        for location_count in location_counts:
            location_counts_list.append(location_count.text)
    # BUG FIX: ``except None:`` is invalid (TypeError at raise time);
    # catch the real element errors so the fallback actually works.
    except (TypeError, AttributeError):
        location_counts_list.append("data_not_available")
    location_counts_list = ['' if v is None else v for v in location_counts_list]
    return location_counts_list
# put data into a list while addressing None type values
def get_desc(job_desc):
    # get the description snippet for each job (optional for later)
    """Return the text of each job-description snippet element.

    Appends "data_not_available" if iteration hits a bad element and maps
    any None texts to '' before returning.
    """
    desc_list = []
    try:
        for desc in job_desc:
            desc_list.append(desc.text)
    # BUG FIX: ``except None:`` is invalid (TypeError at raise time);
    # catch the real element errors so the fallback actually works.
    except (TypeError, AttributeError):
        desc_list.append("data_not_available")
    desc_list = ['' if v is None else v for v in desc_list]
    return desc_list
# ref: https://towardsdatascience.com/statistics-in-python-collinearity-and-multicollinearity-4cc4dcd82b3f
def calculate_vif(df, features):
    """Compute tolerance and Variance Inflation Factor for each feature.

    Each feature is regressed on all of the others; tolerance = 1 - R^2
    and VIF = 1 / tolerance. Returns a DataFrame indexed by feature with
    'VIF' and 'Tolerance' columns.
    """
    vif, tolerance = {}, {}
    # all features wanted in determining target
    for feature in features:
        # extract all the other features you will regress against
        others = [f for f in features if f != feature]
        X, y = df[others], df[feature]
        # extract r-squared from the fit
        r2 = LinearRegression().fit(X, y).score(X, y)
        # calculate tolerance
        tolerance[feature] = 1 - r2
        # BUG FIX: guard against perfect collinearity (r2 == 1), which
        # previously raised ZeroDivisionError; report an infinite VIF.
        vif[feature] = float('inf') if tolerance[feature] == 0 else 1 / tolerance[feature]
    # return VIF DataFrame
    return pd.DataFrame({'VIF': vif, 'Tolerance': tolerance})
# based on https://www.youtube.com/watch?v=Q_nKKx8L_qE
# BUG FIX: the header name must be "User-Agent" (hyphenated). "User Agent"
# is not a recognized HTTP header, so requests was actually sending its
# default python-requests UA string.
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.37"}
base_url = "https://www.indeed.com"
# indeed paginates in steps of 10 via the ``start`` query parameter
start_url = "https://www.indeed.com/jobs?q=data%20scientist&start={}"
# accumulators: one sub-list per results page
job_list = []
location_list = []
salary_list = []
company_list = []
rating_list = []
location_counts_list = []
link_list = []
desc_list = []
for page_num in range(0, 1000, 10):
    #print(page_num)
    r = requests.get(start_url.format(page_num), headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    # CSS classes as observed on indeed's results page at scrape time
    titles = soup.find_all(class_="jobTitle")
    locations = soup.find_all(class_="companyLocation")
    salaries = soup.find_all(class_='attribute_snippet')
    companies = soup.find_all(class_="companyName")
    ratings = soup.find_all(class_="ratingNumber")
    location_counts = soup.find_all(class_="more_loc_container")
    links = soup.find_all(class_=re.compile('tapItem fs-unmask result job_'))
    description = soup.find_all(class_="job-snippet")
    job_list.append(get_title(titles))
    location_list.append(get_location(locations))
    salary_list.append(get_salary(salaries))
    company_list.append(get_company(companies))
    rating_list.append(get_rating(ratings))
    location_counts_list.append(get_location_counts(location_counts))
    link_list.append(get_link(links))
    desc_list.append(get_desc(description))
    # wait a little before making next request
    #time.sleep(.725456189 + random.random())
# flatten the per-page sub-lists into one flat list per field
final_job_list = []
for j in job_list:
    final_job_list += j
final_location_list = []
for l in location_list:
    final_location_list += l
final_salary_list = []
for s in salary_list:
    final_salary_list += s
final_company_list = []
for c in company_list:
    final_company_list += c
final_rating_list = []
for r in rating_list:
    final_rating_list += r
final_location_counts_list = []
for f in location_counts_list:
    final_location_counts_list += f
final_link_list = []
for k in link_list:
    final_link_list += k
final_desc_list = []
for d in desc_list:
    final_desc_list += d
# since all data are in lists, zip them for pandas ingestion
# (note: zip truncates to the shortest list)
#data_tuples = zip(final_job_list, final_location_list, final_salary_list, final_company_list, final_rating_list, final_location_counts_list, final_link_list, final_desc_list)
data_tuples = zip(final_job_list, final_location_list, final_salary_list, final_company_list, final_rating_list, final_location_counts_list)
# create a dataframe with the data tuple
#df = pd.DataFrame(data_tuples, columns=["job_title", "location", "salary", "company", "company_rating", "job_locations_available", "job_link", "job_description"])
df = pd.DataFrame(data_tuples, columns=["job_title", "location", "salary", "company", "company_rating", "job_locations_available"])
# save to csv for later (append mode, so repeated runs accumulate rows)
df.to_csv('indeed_salary_data.csv', mode='a', index=False, header=False)
# set views for notebook
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
# now that there's a csv from the scrape, read from that instead
df = pd.read_csv('indeed_salary_data.csv')
df.drop_duplicates(inplace=True)
df.sample(n=5, random_state=42).style
| job_title | location | salary | company | company_rating | job_locations_available | |
|---|---|---|---|---|---|---|
| 7678 | newRemote - Staff /Lead Data Scientist | Remote in Frisco, TX 75033 75033+6 locations | 150000 | Shopify | 4.100000 | +2 locations |
| 8280 | Director, Data Science | Remote | data_not_available | Slack | 3.400000 | +1 location |
| 9199 | Principal Data Scientist | Remote in Colorado+43 locations | data_not_available | Verizon | 3.700000 | +13 locations |
| 9895 | Junior Data Scientist | Remote in New York, NY | data_not_available | SmartAsset | 4.200000 | +3 locations |
| 9401 | newData Scientist | United States | data_not_available | Fund For Public Health In New York Inc | 4.200000 | +1 location |
# quick structural overview (dtypes, non-null counts) of the de-duplicated data
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 11465 entries, 0 to 11662 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 job_title 11465 non-null object 1 location 11465 non-null object 2 salary 11465 non-null object 3 company 11465 non-null object 4 company_rating 11465 non-null float64 5 job_locations_available 11465 non-null object dtypes: float64(1), object(5) memory usage: 627.0+ KB
# peek at the raw "+N location(s)" strings before cleaning them
df['job_locations_available'].value_counts().head()
+1 location 2966 +2 locations 1837 +4 locations 1186 +3 locations 1003 +6 locations 496 Name: job_locations_available, dtype: int64
# clean df['job_locations_available'] to make it a float
# e.g. "+4 locations" -> 4.0 and "+1 location" -> 1.0 (strip the words,
# the '+' sign and any spaces, then cast what is left)
df['job_locations_available'] = \
df['job_locations_available']\
.str.replace('locations', '', regex=False)\
.str.replace('location', '', regex=False)\
.str.replace('+', '', regex=False)\
.str.replace(' ', '', regex=False).astype(float)
# sanity-check the ratings column (values sit in the 1-5 range per output)
df.company_rating.describe()
count 11465.000000 mean 3.751409 std 0.532293 min 1.000000 25% 3.500000 50% 3.800000 75% 4.100000 max 5.000000 Name: company_rating, dtype: float64
# check the target
# most rows have no posted salary ("data_not_available"), so the usable
# sample will shrink considerably after filtering
df['salary'].value_counts().head()
data_not_available 9085 150000 184 160000 104 120000 104 180000 82 Name: salary, dtype: int64
# get only values with a salary and make it a float since there's a lot of "data_not_available"
mask = df['salary'] != "data_not_available"
df = df[mask]
# reset the index so later positional operations line up
df.reset_index(drop=True, inplace=True)
df['salary'] = df['salary'].astype(float)
# spot-check the filtered rows
df.sample(n=5, random_state=42).style
| job_title | location | salary | company | company_rating | job_locations_available | |
|---|---|---|---|---|---|---|
| 1413 | IT Specialty Resource/Data Scientist - Richmond, VA (DE1JP00... | Richmond, VA | 0.000000 | BCP Engineers & Consultants | 3.600000 | 1.000000 |
| 2168 | newDoD Senior Data Scientist, Managing Consultant | Jessup, MD 20794 20794+10 locations | 141440.000000 | Guidehouse | 4.200000 | 97.000000 |
| 2002 | Data Science Co-Op | Princeton, NJ+5 locations | 170000.000000 | Bristol Myers Squibb | 3.400000 | 24.000000 |
| 1996 | Lead/Principal Data Scientist | +9 locationsRemote | 197600.000000 | Salesforce | 3.700000 | 1.000000 |
| 1103 | Senior Statistical Programmer | Carolina, PR+2 locations | 150000.000000 | Worldwide Clinical Trials - USA | 3.300000 | 3.000000 |
# clean the df['job_title'] column
df['job_title'] = df['job_title'].str.lower()
# strip the "new" badge prefix, the "(all levels)"/"(remote)" suffixes,
# and any leading/trailing spaces (replacements run in order)
df['job_title'] = df['job_title']\
.replace('^new', '', regex=True)\
.replace('\(all levels\)|\(remote\)', '', regex=True)\
.replace('^ ', '', regex=True)\
.replace(' $', '', regex=True)
# clean the df['location'] column to start extracting only states
df['location'] = df['location'].str.lower()
# the replacements run in order: strip "+N location(s)" counters, collapse
# "remote in <place>" to plain "remote", drop temporary/hybrid markers,
# fix a "remote" clipped to "emote" by the previous rule, strip the
# "(downtown area)" suffix and trailing zip/address fragments, treat
# "united states" as remote, expand "remote" into a "remote, remote"
# city/state pair so the later comma-split works, and trim stray spaces.
df['location'] =\
df['location']\
.replace('\+[0-9]{1,}? location(.)?', '', regex=True)\
.replace('remote in.*', 'remote', regex=True)\
.replace('temporarily', '', regex=True)\
.replace('^emote', 'remote', regex=True)\
.replace('hybrid', '', regex=True)\
.replace('\(downtownarea\)', '', regex=True)\
.replace(',?\s(\w+)?([-\d]+).*', '', regex=True)\
.replace('united states', 'remote', regex=False)\
.replace('remote', 'remote, remote', regex=False)\
.replace('^ ', '', regex=True)\
.replace(' $', '', regex=True)
# dictionary for all state to state abbreviations
# thanks to https://gist.github.com/rogerallen/1583593
# maps lowercase full state names (plus DC and the US territories) to
# their two-letter postal codes; used with Series.replace below
state_convert = {'alabama': 'al',
'alaska': 'ak',
'arizona': 'az',
'arkansas': 'ar',
'california': 'ca',
'colorado': 'co',
'connecticut': 'ct',
'delaware': 'de',
'florida': 'fl',
'georgia': 'ga',
'hawaii': 'hi',
'idaho': 'id',
'illinois': 'il',
'indiana': 'in',
'iowa': 'ia',
'kansas': 'ks',
'kentucky': 'ky',
'louisiana': 'la',
'maine': 'me',
'maryland': 'md',
'massachusetts': 'ma',
'michigan': 'mi',
'minnesota': 'mn',
'mississippi': 'ms',
'missouri': 'mo',
'montana': 'mt',
'nebraska': 'ne',
'nevada': 'nv',
'new hampshire': 'nh',
'new jersey': 'nj',
'new mexico': 'nm',
'new york': 'ny',
'north carolina': 'nc',
'north dakota': 'nd',
'ohio': 'oh',
'oklahoma': 'ok',
'oregon': 'or',
'pennsylvania': 'pa',
'rhode island': 'ri',
'south carolina': 'sc',
'south dakota': 'sd',
'tennessee': 'tn',
'texas': 'tx',
'utah': 'ut',
'vermont': 'vt',
'virginia': 'va',
'washington': 'wa',
'west virginia': 'wv',
'wisconsin': 'wi',
'wyoming': 'wy',
'district of columbia': 'dc',
'american samoa': 'as',
'guam': 'gu',
'northern mariana islands': 'mp',
'puerto rico': 'pr',
'united states minor outlying islands': 'um',
'u.s. virgin islands': 'vi'
}
# spot-check rows after the location normalisation
df.sample(n=5, random_state=42).style
| job_title | location | salary | company | company_rating | job_locations_available | |
|---|---|---|---|---|---|---|
| 1413 | it specialty resource/data scientist - richmond, va (de1jp00... | richmond, va | 0.000000 | BCP Engineers & Consultants | 3.600000 | 1.000000 |
| 2168 | dod senior data scientist, managing consultant | jessup, md | 141440.000000 | Guidehouse | 4.200000 | 97.000000 |
| 2002 | data science co-op | princeton, nj | 170000.000000 | Bristol Myers Squibb | 3.400000 | 24.000000 |
| 1996 | lead/principal data scientist | remote, remote | 197600.000000 | Salesforce | 3.700000 | 1.000000 |
| 1103 | senior statistical programmer | carolina, pr | 150000.000000 | Worldwide Clinical Trials - USA | 3.300000 | 3.000000 |
# match up entire state names and replace with the state abbreviation based on the dictionary
df['location'] = df['location'].replace(state_convert)
# based on https://datascienceparichay.com/article/pandas-split-column-by-delimiter/
# split column and add new columns to df
df[['city', 'state']] = df['location'].str.split(',', expand=True)
df['city'] = df['city'].astype(str)
df['state'] = df['state'].astype(str)
# if there was no comma, the single token landed in 'city' but is really a
# state, so copy it across.
# BUG FIX: use .loc instead of chained indexing
# (df['state'][df['state'] == 'None'] = df['city']), which raised
# SettingWithCopyWarning and can silently assign to a temporary copy.
df.loc[df['state'] == 'None', 'state'] = df['city']
# drop columns not needed anymore
df.drop(columns=['location', 'city'], inplace=True)
/var/folders/tp/d_3ffh5127l6t9cx918sddb00000gq/T/ipykernel_4977/4047493820.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['state'][df['state'] == 'None'] = df['city']
# get the regions per state (east, west, etc.)
# https://github.com/cphalpert/census-regions/blob/master/us%20census%20bureau%20regions%20and%20divisions.csv
state_regions = pd.read_csv('state_regions.csv')
state_regions = state_regions[['state_code', 'region']]
# build a {state_code: region} lookup for Series.replace
state_regions = state_regions.set_index('state_code')['region'].to_dict()
# final cleaning of state and convert to region
# strip spaces and the "(downtown area)" suffix, then map codes to regions
df['state'] =\
df['state']\
.replace(' *','', regex=True)\
.replace('\(downtownarea\)', '', regex=True)\
.replace(state_regions)
# I don't want Puerto Rico for this
# NOTE: boolean selection with index alignment leaves the 'pr' rows as
# NaN rather than dropping them (see the nan region in the output below);
# they are removed later by dropna().
df['state'] = df['state'][df['state'] != 'pr']
# finally, rename the column to region
df.rename(columns={'state': 'region'}, inplace=True)
# check it
df.sample(n=5, random_state=42).style
| job_title | salary | company | company_rating | job_locations_available | region | |
|---|---|---|---|---|---|---|
| 1413 | it specialty resource/data scientist - richmond, va (de1jp00... | 0.000000 | BCP Engineers & Consultants | 3.600000 | 1.000000 | south |
| 2168 | dod senior data scientist, managing consultant | 141440.000000 | Guidehouse | 4.200000 | 97.000000 | south |
| 2002 | data science co-op | 170000.000000 | Bristol Myers Squibb | 3.400000 | 24.000000 | northeast |
| 1996 | lead/principal data scientist | 197600.000000 | Salesforce | 3.700000 | 1.000000 | remote |
| 1103 | senior statistical programmer | 150000.000000 | Worldwide Clinical Trials - USA | 3.300000 | 3.000000 | nan |
# make the title more standardized
# collapse free-text titles into a small set of categories; order matters,
# because each .loc assignment can overwrite matches from earlier rules
# (e.g. the final broad pattern re-captures most rows as "data scientist").
# NOTE(review): na=True makes rows with a missing job_title match every
# pattern and end up relabelled — confirm that is intended.
df.loc[df['job_title'].str.contains(r'data scien', na=True), 'job_title'] = "data scientist"
df.loc[df['job_title'].str.contains(r'data ana| ana', na=True), 'job_title'] = "data analyst"
df.loc[df['job_title'].str.contains(r'statistic|statistician|stat ', na=True), 'job_title'] = "statistician"
df.loc[df['job_title'].str.contains(r'engineer|ux dev|application dev|dashboard|risk aud', na=True), 'job_title'] = "data engineer"
df.loc[df['job_title'].str.contains(r'manager', na=True), 'job_title'] = "data science / ml manager"
df.loc[df['job_title'].str.contains(r'machine', na=True), 'job_title'] = "machine learning engineer"
df.loc[df['job_title'].str.contains(r'sales|deals', na=True), 'job_title'] = "data science sales"
df.loc[df['job_title'].str.contains(r'lake', na=True), 'job_title'] = "data lake engineer"
df.loc[df['job_title'].str.contains(r'president|avp|director', na=True), 'job_title'] = "data science / ml vp"
df.loc[df['job_title'].str.contains(r'algorithm dev|senior math', na=True), 'job_title'] = "mathematician"
df.loc[df['job_title'].str.contains(r' ai|ai/ml|ml/ai| ?nlp ?|data |scientist|deep | modeler|natural language proc|decision sci', na=True), 'job_title'] = "data scientist"
# how many rows landed in each category
df['job_title'].value_counts()
data scientist 2033 statistician 262 machine learning engineer 80 mathematician 5 Name: job_title, dtype: int64
# confirm the titles collapsed into the standard categories
df.sample(n=5, random_state=42).style
| job_title | salary | company | company_rating | job_locations_available | region | |
|---|---|---|---|---|---|---|
| 1413 | data scientist | 0.000000 | BCP Engineers & Consultants | 3.600000 | 1.000000 | south |
| 2168 | data scientist | 141440.000000 | Guidehouse | 4.200000 | 97.000000 | south |
| 2002 | data scientist | 170000.000000 | Bristol Myers Squibb | 3.400000 | 24.000000 | northeast |
| 1996 | data scientist | 197600.000000 | Salesforce | 3.700000 | 1.000000 | remote |
| 1103 | statistician | 150000.000000 | Worldwide Clinical Trials - USA | 3.300000 | 3.000000 | nan |
# get a list of all companies
all_companies = df['company'].unique().tolist()
existing_company_data_list = pd.read_csv('company_ratings.csv')['Company'].to_list()
# https://www.csestack.org/compare-two-lists-python-return-non-match-elements/
# symmetric difference: only visit companies not already in the ratings csv
non_match_a = set(all_companies) - set(existing_company_data_list)
non_match_b = set(existing_company_data_list) - set(all_companies)
companies = list(non_match_a) + list(non_match_b)
list_of_dicts = []
for company in companies:
    driver = None
    try:
        # create object for chrome options
        chrome_options = Options()
        base_url = 'https://www.indeed.com/cmp/{}'.format(company)
        chrome_options.add_argument('disable-notifications')
        chrome_options.add_argument('--disable-infobars')
        chrome_options.add_experimental_option("prefs", {"profile.default_content_setting_values.notifications": 2})
        driver = webdriver.Chrome('chromedriver', options=chrome_options)
        driver.get(base_url)
        delay = 3
        # click into page to get the full report
        driver.find_element_by_xpath('//*[@id="cmp-container"]/div/div[1]/main/div[2]/div[1]/section/div/div/button').click()
        happy_scores = driver.find_element_by_xpath('//*[@id="cmp-container"]/div/div[1]/main/div[2]/div[1]/section/div/div/div[2]').text
        my_list = []
        for line in io.StringIO(happy_scores):
            my_list.append(line.rstrip('\n'))
        # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
        # split into chunks of 2 lines, keep every other chunk, and swap
        # each pair so it becomes (label, score)
        my_list = list(zip_longest(*[iter(my_list)]*2, fillvalue=None))
        my_list = my_list[0::2]
        # https://stackoverflow.com/questions/13384841/swap-values-in-a-tuple-list-inside-a-list-in-python
        my_list = [(t[1], t[0]) for t in my_list]
        my_list.append(("Company", company))
        list_of_dicts.append(dict(my_list))
    # BUG FIX: narrowed the bare ``except:`` (which also swallowed
    # KeyboardInterrupt/SystemExit) to Exception; the loop stays
    # best-effort for companies whose page lacks the happiness widget.
    except Exception:
        pass
    finally:
        # BUG FIX: always shut the browser down — the original only called
        # driver.close() on success, leaking one Chrome process for every
        # company that raised.
        if driver is not None:
            driver.quit()
# list_of_dicts to csv instead of going to get it over and over again
pd.DataFrame.from_dict(list_of_dicts).to_csv('company_ratings.csv', mode='a')
# from the csv created for the company ratings, use it
happy_df = pd.read_csv('company_ratings.csv')
# normalise the header casing before joining
happy_df.columns = happy_df.columns.str.lower()
happy_df.columns
Index(['unnamed: 0', 'work happiness score', 'flexibility', 'learning',
'achievement', 'appreciation', 'inclusion', 'support', 'purpose',
'energy', 'compensation', 'satisfaction', 'management', 'trust',
'belonging', 'stress-free', 'company'],
dtype='object')
# remove unnamed column and rename happiness
del happy_df['unnamed: 0']
happy_df.rename(columns={"work happiness score": "happiness"}, inplace=True)
# left join keeps every job row even when a company has no rating data
df = df.merge(happy_df, how='left', on="company").reset_index(drop=True)
# view a histogram to see the distribution of salary
sns.histplot(df['salary'], kde=True)
<AxesSubplot:xlabel='salary', ylabel='Count'>
# check the raw salary extremes before outlier trimming
print(df.salary.min())
print(df.salary.max())
0.0 434000.0
# since the data is skewed right a little, specify percentiles to remove
# NOTE(review): the quantiles are computed before dropna/filters run, so
# they reflect the pre-cleaning distribution — confirm that is intended.
lowq = df.salary.quantile(q=0.01)
highq = df.salary.quantile(q=0.99)
# also drops rows with missing region/happiness data from the left join
df.dropna(inplace=True)
df = df[df.salary > lowq].reset_index(drop=True)
df = df[df.salary < highq].reset_index(drop=True)
print(df.salary.min())
print(df.salary.max())
50000.0 300000.0
# view a histogram to see the distribution of salary after trimming
sns.histplot(df['salary'], kde=True)
<AxesSubplot:xlabel='salary', ylabel='Count'>
# cast every happiness sub-score column to float for modelling
convert_col = {
'happiness': float,
'flexibility': float,
'learning': float,
'achievement': float,
'appreciation': float,
'inclusion': float,
'support': float,
'purpose': float,
'energy': float,
'compensation': float,
'satisfaction': float,
'management': float,
'trust': float,
'belonging': float,
'stress-free': float
}
df = df.astype(convert_col)
# confirm the casts took effect
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1001 entries, 0 to 1000 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 job_title 1001 non-null object 1 salary 1001 non-null float64 2 company 1001 non-null object 3 company_rating 1001 non-null float64 4 job_locations_available 1001 non-null float64 5 region 1001 non-null object 6 happiness 1001 non-null float64 7 flexibility 1001 non-null float64 8 learning 1001 non-null float64 9 achievement 1001 non-null float64 10 appreciation 1001 non-null float64 11 inclusion 1001 non-null float64 12 support 1001 non-null float64 13 purpose 1001 non-null float64 14 energy 1001 non-null float64 15 compensation 1001 non-null float64 16 satisfaction 1001 non-null float64 17 management 1001 non-null float64 18 trust 1001 non-null float64 19 belonging 1001 non-null float64 20 stress-free 1001 non-null float64 dtypes: float64(18), object(3) memory usage: 164.4+ KB
# ref: o365 Data Science - Linear Regression Practical Example (Part 5)
# side-by-side scatter plots of salary against three numeric candidates
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize =(15,3)) #sharey -> share 'salary' as y
ax1.scatter(df['job_locations_available'],df['salary'])
ax1.set_title('Count of Job Locations with Salary')
ax2.scatter(df['happiness'],df['salary'])
ax2.set_title('Happiness Score with Salary')
ax3.scatter(df['company_rating'],df['salary'])
ax3.set_title('Company Rating with Salary')
plt.show()
# keep a deep copy around to restore the pre-dummy frame if needed
#df_orig = df.copy(deep=True)
# revert
#df = df_orig.copy(deep=True)
df.head()
| job_title | salary | company | company_rating | job_locations_available | region | happiness | flexibility | learning | achievement | appreciation | inclusion | support | purpose | energy | compensation | satisfaction | management | trust | belonging | stress-free | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | data scientist | 175000.0 | Oracle | 3.2 | 1.0 | remote | 65.0 | 77.0 | 71.0 | 70.0 | 70.0 | 69.0 | 69.0 | 67.0 | 64.0 | 64.0 | 63.0 | 63.0 | 63.0 | 61.0 | 57.0 |
| 1 | data scientist | 90000.0 | The J. M. Smucker Company | 3.8 | 9.0 | remote | 55.0 | 55.0 | 60.0 | 62.0 | 61.0 | 55.0 | 59.0 | 59.0 | 56.0 | 69.0 | 54.0 | 52.0 | 52.0 | 50.0 | 58.0 |
| 2 | data scientist | 124800.0 | Starbucks | 3.9 | 4.0 | remote | 65.0 | 72.0 | 68.0 | 70.0 | 75.0 | 73.0 | 73.0 | 70.0 | 67.0 | 60.0 | 64.0 | 64.0 | 66.0 | 65.0 | 46.0 |
| 3 | data scientist | 90000.0 | Salesforce | 4.1 | 3.0 | remote | 76.0 | 84.0 | 80.0 | 78.0 | 82.0 | 79.0 | 80.0 | 78.0 | 74.0 | 82.0 | 75.0 | 75.0 | 76.0 | 74.0 | 58.0 |
| 4 | data scientist | 155000.0 | Nordstrom Inc | 3.7 | 17.0 | remote | 67.0 | 69.0 | 68.0 | 72.0 | 71.0 | 68.0 | 70.0 | 70.0 | 68.0 | 67.0 | 64.0 | 65.0 | 64.0 | 64.0 | 56.0 |
# one-hot encode the categorical columns; drop_first avoids the dummy-variable
# trap (perfect collinearity among the indicator columns)
df = pd.get_dummies(df, columns=['region', 'job_title'], drop_first=True, dtype=float)
# dropping company as it's too much noise for this project (talked with Lisa)
df = df.drop(columns=['company'])
# how is the data correlated to salary
# looking for 0.5 and greater to remove - only between features and not including target
df.corr().sort_values(by='salary', ascending=False)
| salary | company_rating | job_locations_available | happiness | flexibility | learning | achievement | appreciation | inclusion | support | purpose | energy | compensation | satisfaction | management | trust | belonging | stress-free | region_northeast | region_remote | region_south | region_west | job_title_machine learning engineer | job_title_mathematician | job_title_statistician | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| salary | 1.000000 | -0.035209 | 0.027299 | -0.003748 | 0.039047 | 0.006854 | -0.002857 | -0.003891 | 0.003997 | -0.004927 | -0.006594 | -0.013706 | 0.001003 | -0.011819 | -0.008530 | 0.001644 | -0.004187 | 0.014347 | 0.024355 | 0.002013 | -0.019834 | -0.003959 | 0.017243 | 0.079121 | 0.048841 |
| job_title_mathematician | 0.079121 | 0.074720 | -0.010744 | -0.014968 | -0.011187 | -0.020990 | -0.021811 | -0.010542 | -0.020796 | -0.013899 | -0.024230 | -0.015782 | -0.015097 | -0.003742 | -0.019020 | -0.015457 | -0.013399 | 0.013841 | -0.019806 | -0.025471 | -0.028022 | 0.031075 | -0.009364 | 1.000000 | -0.013717 |
| job_title_statistician | 0.048841 | -0.034587 | -0.058344 | -0.045100 | 0.174143 | 0.056015 | 0.053213 | 0.081933 | -0.017293 | 0.053876 | 0.027750 | -0.003817 | 0.013162 | -0.133640 | 0.101005 | 0.052379 | 0.027025 | 0.203816 | 0.133974 | -0.124778 | 0.014044 | -0.125620 | -0.064158 | -0.013717 | 1.000000 |
| flexibility | 0.039047 | -0.051267 | -0.032607 | 0.776011 | 1.000000 | 0.687222 | 0.744167 | 0.646278 | 0.742601 | 0.578168 | 0.765470 | 0.775657 | 0.539915 | 0.706887 | 0.616523 | 0.739100 | 0.713359 | 0.523343 | 0.017008 | -0.080955 | -0.055417 | 0.152827 | 0.088029 | -0.011187 | 0.174143 |
| job_locations_available | 0.027299 | -0.036188 | 1.000000 | -0.023062 | -0.032607 | -0.015154 | -0.024949 | -0.034677 | -0.013884 | -0.033486 | -0.013859 | -0.017491 | -0.032145 | -0.002183 | -0.042976 | -0.023492 | -0.015386 | -0.025365 | 0.028121 | 0.013813 | -0.016808 | 0.023135 | -0.028299 | -0.010744 | -0.058344 |
| region_northeast | 0.024355 | 0.013871 | 0.028121 | 0.021141 | 0.017008 | 0.046181 | 0.050004 | 0.053452 | 0.031423 | 0.046559 | 0.062557 | 0.022505 | 0.070811 | 0.025556 | 0.075285 | 0.036639 | 0.050446 | -0.002618 | 1.000000 | -0.251989 | -0.277217 | -0.231509 | 0.149684 | -0.019806 | 0.133974 |
| job_title_machine learning engineer | 0.017243 | 0.032192 | -0.028299 | 0.084923 | 0.088029 | 0.053536 | 0.075869 | 0.084940 | 0.057561 | 0.078751 | 0.088502 | 0.082214 | 0.093049 | 0.074387 | 0.092915 | 0.065904 | 0.082744 | 0.067925 | 0.149684 | -0.107545 | 0.057243 | -0.048787 | 1.000000 | -0.009364 | -0.064158 |
| stress-free | 0.014347 | 0.011183 | -0.025365 | 0.547008 | 0.523343 | 0.282221 | 0.421768 | 0.254171 | 0.307720 | 0.187314 | 0.451016 | 0.572823 | 0.249734 | 0.407341 | 0.236964 | 0.332053 | 0.348737 | 1.000000 | -0.002618 | -0.047985 | -0.018134 | 0.066971 | 0.067925 | 0.013841 | 0.203816 |
| learning | 0.006854 | -0.012317 | -0.015154 | 0.870397 | 0.687222 | 1.000000 | 0.903213 | 0.948274 | 0.927836 | 0.932755 | 0.904552 | 0.854605 | 0.827494 | 0.869481 | 0.920280 | 0.947524 | 0.949723 | 0.282221 | 0.046181 | -0.102587 | -0.036593 | 0.187103 | 0.053536 | -0.020990 | 0.056015 |
| inclusion | 0.003997 | -0.016445 | -0.013884 | 0.902720 | 0.742601 | 0.927836 | 0.896801 | 0.930087 | 1.000000 | 0.911187 | 0.891853 | 0.871500 | 0.792636 | 0.908077 | 0.909596 | 0.969069 | 0.961463 | 0.307720 | 0.031423 | -0.076049 | -0.075068 | 0.232517 | 0.057561 | -0.020796 | -0.017293 |
| region_remote | 0.002013 | 0.017334 | 0.013813 | -0.110669 | -0.080955 | -0.102587 | -0.092731 | -0.080517 | -0.076049 | -0.059921 | -0.128343 | -0.119135 | -0.109299 | -0.086630 | -0.059327 | -0.089056 | -0.105589 | -0.047985 | -0.251989 | 1.000000 | -0.356519 | -0.297735 | -0.107545 | -0.025471 | -0.124778 |
| trust | 0.001644 | -0.013003 | -0.023492 | 0.910686 | 0.739100 | 0.947524 | 0.927137 | 0.964242 | 0.969069 | 0.948238 | 0.910933 | 0.880021 | 0.825302 | 0.912438 | 0.946891 | 1.000000 | 0.981367 | 0.332053 | 0.036639 | -0.089056 | -0.052473 | 0.198146 | 0.065904 | -0.015457 | 0.052379 |
| compensation | 0.001003 | -0.018665 | -0.032145 | 0.758986 | 0.539915 | 0.827494 | 0.804472 | 0.824990 | 0.792636 | 0.828123 | 0.781267 | 0.732930 | 1.000000 | 0.793079 | 0.809801 | 0.825302 | 0.832830 | 0.249734 | 0.070811 | -0.109299 | -0.019308 | 0.158483 | 0.093049 | -0.015097 | 0.013162 |
| achievement | -0.002857 | -0.020792 | -0.024949 | 0.923023 | 0.744167 | 0.903213 | 1.000000 | 0.898248 | 0.896801 | 0.869246 | 0.916926 | 0.920060 | 0.804472 | 0.897027 | 0.872263 | 0.927137 | 0.932330 | 0.421768 | 0.050004 | -0.092731 | -0.032869 | 0.174742 | 0.075869 | -0.021811 | 0.053213 |
| happiness | -0.003748 | -0.016834 | -0.023062 | 1.000000 | 0.776011 | 0.870397 | 0.923023 | 0.848199 | 0.902720 | 0.808219 | 0.943573 | 0.979595 | 0.758986 | 0.938607 | 0.811637 | 0.910686 | 0.931611 | 0.547008 | 0.021141 | -0.110669 | -0.022073 | 0.221910 | 0.084923 | -0.014968 | -0.045100 |
| appreciation | -0.003891 | -0.005884 | -0.034677 | 0.848199 | 0.646278 | 0.948274 | 0.898248 | 1.000000 | 0.930087 | 0.980155 | 0.862865 | 0.819120 | 0.824990 | 0.858801 | 0.960125 | 0.964242 | 0.954249 | 0.254171 | 0.053452 | -0.080517 | -0.025129 | 0.142968 | 0.084940 | -0.010542 | 0.081933 |
| region_west | -0.003959 | -0.028116 | 0.023135 | 0.221910 | 0.152827 | 0.187103 | 0.174742 | 0.142968 | 0.232517 | 0.129185 | 0.205103 | 0.218019 | 0.158483 | 0.212126 | 0.109005 | 0.198146 | 0.215169 | 0.066971 | -0.231509 | -0.297735 | -0.327543 | 1.000000 | -0.048787 | 0.031075 | -0.125620 |
| belonging | -0.004187 | -0.010699 | -0.015386 | 0.931611 | 0.713359 | 0.949723 | 0.932330 | 0.954249 | 0.961463 | 0.935044 | 0.932864 | 0.908756 | 0.832830 | 0.931641 | 0.935930 | 0.981367 | 1.000000 | 0.348737 | 0.050446 | -0.105589 | -0.052334 | 0.215169 | 0.082744 | -0.013399 | 0.027025 |
| support | -0.004927 | 0.004184 | -0.033486 | 0.808219 | 0.578168 | 0.932755 | 0.869246 | 0.980155 | 0.911187 | 1.000000 | 0.828341 | 0.770359 | 0.828123 | 0.837744 | 0.972414 | 0.948238 | 0.935044 | 0.187314 | 0.046559 | -0.059921 | -0.021328 | 0.129185 | 0.078751 | -0.013899 | 0.053876 |
| purpose | -0.006594 | -0.019692 | -0.013859 | 0.943573 | 0.765470 | 0.904552 | 0.916926 | 0.862865 | 0.891853 | 0.828341 | 1.000000 | 0.936246 | 0.781267 | 0.920604 | 0.832182 | 0.910933 | 0.932864 | 0.451016 | 0.062557 | -0.128343 | -0.039175 | 0.205103 | 0.088502 | -0.024230 | 0.027750 |
| management | -0.008530 | -0.004197 | -0.042976 | 0.811637 | 0.616523 | 0.920280 | 0.872263 | 0.960125 | 0.909596 | 0.972414 | 0.832182 | 0.775567 | 0.809801 | 0.834286 | 1.000000 | 0.946891 | 0.935930 | 0.236964 | 0.075285 | -0.059327 | -0.029335 | 0.109005 | 0.092915 | -0.019020 | 0.101005 |
| satisfaction | -0.011819 | -0.008328 | -0.002183 | 0.938607 | 0.706887 | 0.869481 | 0.897027 | 0.858801 | 0.908077 | 0.837744 | 0.920604 | 0.907119 | 0.793079 | 1.000000 | 0.834286 | 0.912438 | 0.931641 | 0.407341 | 0.025556 | -0.086630 | -0.031310 | 0.212126 | 0.074387 | -0.003742 | -0.133640 |
| energy | -0.013706 | -0.021364 | -0.017491 | 0.979595 | 0.775657 | 0.854605 | 0.920060 | 0.819120 | 0.871500 | 0.770359 | 0.936246 | 1.000000 | 0.732930 | 0.907119 | 0.775567 | 0.880021 | 0.908756 | 0.572823 | 0.022505 | -0.119135 | -0.021074 | 0.218019 | 0.082214 | -0.015782 | -0.003817 |
| region_south | -0.019834 | 0.015950 | -0.016808 | -0.022073 | -0.055417 | -0.036593 | -0.032869 | -0.025129 | -0.075068 | -0.021328 | -0.039175 | -0.021074 | -0.019308 | -0.031310 | -0.029335 | -0.052473 | -0.052334 | -0.018134 | -0.277217 | -0.356519 | 1.000000 | -0.327543 | 0.057243 | -0.028022 | 0.014044 |
| company_rating | -0.035209 | 1.000000 | -0.036188 | -0.016834 | -0.051267 | -0.012317 | -0.020792 | -0.005884 | -0.016445 | 0.004184 | -0.019692 | -0.021364 | -0.018665 | -0.008328 | -0.004197 | -0.013003 | -0.010699 | 0.011183 | 0.013871 | 0.017334 | 0.015950 | -0.028116 | 0.032192 | 0.074720 | -0.034587 |
# pairwise scatter of every feature pair (target excluded) to eyeball collinearity
sns.pairplot(df.drop(columns=['salary']))
<seaborn.axisgrid.PairGrid at 0x12da493d0>
# final feature set after iterative VIF-driven pruning; the survey metrics
# removed in earlier passes (happiness, learning, appreciation, inclusion,
# support, energy, satisfaction, trust, belonging) had excessive
# multicollinearity with the columns kept here
base_features = [
    'company_rating',
    'job_locations_available',
]
survey_features = [
    'flexibility',
    'achievement',
    'purpose',
    'compensation',
    'management',
    'stress-free',
]
dummy_features = [
    'region_northeast',
    'region_remote',
    'region_south',
    'region_west',
    'job_title_machine learning engineer',
    'job_title_mathematician',
    'job_title_statistician',
]
features = base_features + survey_features + dummy_features
# process: compute the VIF for the chosen features, then repeatedly comment
# out the highest-VIF column above until every remaining VIF is below the
# accepted rule-of-thumb threshold (< 10)
vif_report = calculate_vif(df=df, features=features)
vif_report.sort_values(by='VIF', ascending=False)
| VIF | Tolerance | |
|---|---|---|
| achievement | 9.399643 | 0.106387 |
| purpose | 8.217864 | 0.121686 |
| management | 5.810678 | 0.172097 |
| compensation | 3.498810 | 0.285811 |
| flexibility | 2.980069 | 0.335563 |
| region_south | 2.973245 | 0.336333 |
| region_remote | 2.871602 | 0.348238 |
| region_west | 2.802149 | 0.356869 |
| region_northeast | 2.399556 | 0.416744 |
| stress-free | 1.633134 | 0.612320 |
| job_title_statistician | 1.255781 | 0.796317 |
| job_title_machine learning engineer | 1.070066 | 0.934522 |
| company_rating | 1.017819 | 0.982493 |
| job_locations_available | 1.014491 | 0.985716 |
| job_title_mathematician | 1.014200 | 0.985999 |
# annotated correlation heatmap restricted to the selected features
fig = plt.figure(figsize=(20, 10))
ax = sns.heatmap(df[features].corr(), cmap="seismic", annot=True, vmin=-1, vmax=1)
ax.set_title("Correlations Between Chosen Features", fontsize=20);
# ref: lesson on cross-validation and regression exercises
# NOTES: fit AND score on train, scoring only (not fit) on validation
y = df['salary']   # target
X = df[features]   # features
# ref: validation_workflow_and_utilities from cross-validate lessons
# hold out 20% of rows for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# print shape for reference in case of shape errors later
for matrix, target in ((X, y), (X_train, y_train), (X_test, y_test)):
    print(matrix.shape, target.shape)
(1001, 15) (1001,) (800, 15) (800,) (201, 15) (201,)
# ref: regression_practice_solution
# baseline model: plain OLS on the selected features
model = LinearRegression()
lr = model.fit(X_train, y_train)
# R² on train vs. test shows how much of the fit generalizes
r2_train_l = lr.score(X_train, y_train)
r2_test_l = lr.score(X_test, y_test)
# predictions feed the RMSE computation below
y_pred_train_l = lr.predict(X_train)
y_pred_test_l = lr.predict(X_test)
rsme_training_data_l = sqrt(mean_squared_error(y_train, y_pred_train_l))
rsme_test_data_l = sqrt(mean_squared_error(y_test, y_pred_test_l))
# sanity check: prediction vectors match the split sizes
y_pred_train_l.shape, y_pred_test_l.shape
((800,), (201,))
# report fit quality for the baseline OLS model
print(f"R² training score: {r2_train_l}")
print(f"R² test score: {r2_test_l}")
print(f"RMSE on training data: {rsme_training_data_l}")
print(f"RMSE on test data: {rsme_test_data_l}")
R² training score: 0.022003228592472723 R² test score: -0.04021372779066179 RMSE on training data: 43960.993166465785 RMSE on test data: 47298.627064668064
# expand the feature space with degree-2 polynomial and interaction terms
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
# transform (never re-fit) the test set with the same polynomial mapping
X_test_poly = poly.transform(X_test)
# OLS on the expanded feature matrix
lr_poly = LinearRegression().fit(X_train_poly, y_train)
r2_train_p = lr_poly.score(X_train_poly, y_train)
r2_test_p = lr_poly.score(X_test_poly, y_test)
y_pred_train_p = lr_poly.predict(X_train_poly)
y_pred_test_p = lr_poly.predict(X_test_poly)
rsme_training_data_p = sqrt(mean_squared_error(y_train, y_pred_train_p))
rsme_test_data_p = sqrt(mean_squared_error(y_test, y_pred_test_p))
# report fit quality for the polynomial model
print(f"R² training score: {r2_train_p}")
print(f"R² test score: {r2_test_p}")
print(f"RMSE on training data: {rsme_training_data_p}")
print(f"RMSE on test data: {rsme_test_data_p}")
R² training score: 0.133336554185415 R² test score: -0.05930605731009586 RMSE on training data: 41383.19567506024 RMSE on test data: 47730.71851318308
# Standard-scale the polynomial features: regularized regression (LASSO)
# penalizes coefficient magnitude, so features must share a comparable scale
scaler = StandardScaler()
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
# Apply the SAME fitted scaler to the test set (transform only, never re-fit)
X_val_test_scaled = scaler.transform(X_test_poly)
# Fit a LASSO model (built-in CV over alpha) using the scaled polynomial features
import warnings
warnings.filterwarnings('ignore')
lasso = LassoCV()
lasso.fit(X_train_poly_scaled, y_train)
r2_train_las = lasso.score(X_train_poly_scaled, y_train)
r2_test_las = lasso.score(X_val_test_scaled, y_test)
# BUG FIX: predict on the SCALED training matrix — the model was fit on
# scaled data, but the original passed the unscaled X_train_poly here,
# which made the training RMSE meaningless
y_pred_train_las = lasso.predict(X_train_poly_scaled)
y_pred_test_las = lasso.predict(X_val_test_scaled)
rsme_training_data_las = sqrt(mean_squared_error(y_train, y_pred_train_las))
rsme_test_data_las = sqrt(mean_squared_error(y_test, y_pred_test_las))
print(f"R² training score: {r2_train_las}")
print(f"R² test score: {r2_test_las}")
# BUG FIX: print the computed RMSE values — the original printed the first
# predicted salary (y_pred_train_las[0]) under an "RMSE" label
print(f"RMSE on training data: {rsme_training_data_las}")
print(f"RMSE on test data: {rsme_test_data_las}")
R² training score: 0.010465265273279845 R² test score: -0.015492087930060405 RMSE on training data: 147142.43625 RMSE on test data: 146871.88121030302
# based on https://www.kaggle.com/code/jack89roberts/top-7-using-elasticnet-with-interactions/notebook
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)
# fit an L2-regularized (Ridge) model on the original, non-polynomial features
lr = Ridge()
lr.fit(X_train, y_train)
r2_train_r = lr.score(X_train, y_train)
r2_test_r = lr.score(X_test, y_test)
y_pred_train_r = lr.predict(X_train)
y_pred_test_r = lr.predict(X_test)
rsme_training_data_r = sqrt(mean_squared_error(y_train, y_pred_train_r))
rsme_test_data_r = sqrt(mean_squared_error(y_test, y_pred_test_r))
print(f"R² training score: {r2_train_r}")
print(f"R² test score: {r2_test_r}")
# BUG FIX: print the computed RMSE values — the original printed the first
# prediction (y_pred_train_r[0]) under an "RMSE" label
print(f"RMSE on training data: {rsme_training_data_r}")
print(f"RMSE on test data: {rsme_test_data_r}")
# check model results: coefficients, intercept, and (adjusted) R² on train
lr_coefs = pd.Series(lr.coef_, index=X_train.columns)
print('----------------')
print('Intercept:', lr.intercept_)
print('----------------')
print(lr_coefs.sort_values(ascending=False))
print('----------------')
print('R²:', lr.score(X_train, y_train))
print("Adjusted R²:", 1 - (1 - lr.score(X_train, y_train)) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1))
print('----------------')
# diagnostic plots on the held-out test set
y_pred = lr.predict(X_test)
plt.figure(figsize=(25, 7))
plt.subplot(1, 3, 1)
# pass x/y as keywords: positional data arguments to regplot were
# deprecated in seaborn 0.12 and later removed
sns.regplot(x=y_test, y=y_pred,
            scatter_kws={'marker': '.', 's': 3, 'alpha': 0.7, 'color': 'r'},
            line_kws={'color': 'k'})
# BUG FIX: the axis labels said 'y_train' but the plots use the test set
plt.xlabel('y_test')
plt.ylabel('y_pred');
plt.title('Actual Value vs. Prediction')
plt.subplot(1, 3, 2)
sns.regplot(x=y_test, y=y_test - y_pred,
            scatter_kws={'marker': '.', 's': 3, 'alpha': 0.7, 'color': 'r'},
            line_kws={'color': 'k'})
plt.xlabel('y_test')
plt.ylabel('y_test - y_pred');
plt.title('Residual vs. Actual Value')
# BUG FIX: the original overwrote the target `y` with predictions here,
# which would silently corrupt any later cell that reuses `y`
y_pred_all = lr.predict(X)
plt.subplot(1, 3, 3)
sns.histplot(y_pred_all, bins=100)
plt.title('Salary Count Spread')
R² training score: 0.019105673058912198 R² test score: -0.018484281874954434 RMSE on training data: 148533.24874272235 RMSE on test data: 141663.25369822685 ---------------- Intercept: 146721.63882069563 ---------------- job_title_mathematician 66637.011376 job_title_statistician 10040.748809 region_remote 4810.934897 region_west 3103.272313 region_northeast 3023.059822 region_south 1738.284115 job_title_machine learning engineer 979.827660 flexibility 457.739263 stress-free 117.263732 compensation 81.629283 job_locations_available 38.915033 purpose -41.123515 management -175.167783 achievement -321.952999 company_rating -3114.404182 dtype: float64 ---------------- R²: 0.019105673058912198 Adjusted R²: 0.0003385622118250442 ----------------
Text(0.5, 1.0, 'Salary Count Spread')